Imports and custom functions

In [1]:
import pandas as pd
import seaborn as sns
import plotly
import plotly.express as px
import plotly.graph_objects as go
plotly.offline.init_notebook_mode(connected=True)
import matplotlib.pyplot as plt
In [2]:
def prepare_df(df_path, df_name):

    """Load one tab-separated table and tag its columns with ``df_name``.

    The file is read with its first row as the header. A ``Significant``
    column is added: rows with ``FDR`` below 0.05 and ``logCPM`` above 2 get
    the two-line label "FDR<0.05 / logCPM>2" (joined by a newline), every
    other row gets "No". Finally every column except ``id`` is renamed to
    ``<column>_<df_name>`` and ``id`` is returned as the first column.

    Parameters
    ----------
    df_path : path or file-like object accepted by ``pandas.read_csv``
        Table that must contain the columns ``id``, ``FDR`` and ``logCPM``.
    df_name : str
        Suffix appended (after an underscore) to every non-``id`` column.

    Returns
    -------
    pandas.DataFrame
        The annotated table with a fresh integer index.
    """

    table = pd.read_csv(df_path, sep="\t", header=0)

    # Default every row to "No", then relabel the rows passing both cut-offs.
    table["Significant"] = "No"
    passes_cutoffs = table["FDR"].lt(0.05) & table["logCPM"].gt(2)
    table.loc[passes_cutoffs, "Significant"] = "FDR<0.05\nlogCPM>2"

    # Park "id" in the index while suffixing so it keeps its plain name,
    # then bring it back as the leading column.
    return table.set_index("id").add_suffix("_" + df_name).reset_index()
In [3]:
def plot_MA(df, df_name):

    """Draw an MA scatter plot (logCPM on x, logFC on y) for one table.

    Points are coloured by the ``Significant_<df_name>`` column: rows
    labelled "No" are black, every other label is red.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing ``logCPM_<df_name>``, ``logFC_<df_name>`` and
        ``Significant_<df_name>`` columns.
    df_name : str
        Column suffix selecting what to plot; also shown in the title.

    Returns
    -------
    None
        The figure is produced as a seaborn/matplotlib side effect.
    """

    hue_col = f"Significant_{df_name}"

    # seaborn hands out list-palette colours in order of first appearance of
    # the hue levels, so a table whose first row is significant would get
    # its colours swapped. Pin the mapping explicitly instead.
    levels = list(df[hue_col].dropna().unique())
    hue_order = ([lvl for lvl in levels if lvl == "No"]
                 + [lvl for lvl in levels if lvl != "No"])
    palette = {lvl: ("black" if lvl == "No" else "red") for lvl in levels}

    sns.set(font_scale=2)
    sns_plot = sns.lmplot(x=f"logCPM_{df_name}",
                          y=f"logFC_{df_name}",
                          data=df,
                          fit_reg=False,
                          hue=hue_col,
                          hue_order=hue_order,
                          palette=palette,
                          height=15,
                          aspect=1)
    # Title the grid's own axes rather than whatever pyplot considers current.
    sns_plot.axes.flat[0].set_title(f"MA plot for {df_name}")
In [4]:
def merge_df(df_1, df_2):

    """Inner-join two tables on ``id`` and add a ``Significant`` column.

    Only ids present in both tables are kept. The added ``Significant``
    column is filled with "No" for every row so that callers can flag rows
    afterwards.

    Parameters
    ----------
    df_1, df_2 : pandas.DataFrame
        Tables that each contain an ``id`` column.

    Returns
    -------
    pandas.DataFrame
        The joined table with the extra ``Significant`` column.
    """

    joined = pd.merge(left=df_1, right=df_2, how="inner", on="id")
    return joined.assign(Significant="No")
In [5]:
def plot_FC_correlation(df, FC_1_name, FC_2_name, hue):

    """Scatter the logFC of one comparison against the logFC of another.

    Points are coloured by the ``hue`` column: rows labelled "No" are black,
    every other label is red.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged table containing ``logFC_<FC_1_name>``, ``logFC_<FC_2_name>``
        and the ``hue`` column.
    FC_1_name : str
        Column suffix of the comparison drawn on the x axis.
    FC_2_name : str
        Column suffix of the comparison drawn on the y axis.
    hue : str
        Name of the column used to colour the points.

    Returns
    -------
    None
        The figure is produced as a seaborn/matplotlib side effect.
    """

    # seaborn hands out list-palette colours in order of first appearance of
    # the hue levels, so a table whose first row is flagged would get its
    # colours swapped. Pin the mapping explicitly instead.
    levels = list(df[hue].dropna().unique())
    hue_order = ([lvl for lvl in levels if lvl == "No"]
                 + [lvl for lvl in levels if lvl != "No"])
    palette = {lvl: ("black" if lvl == "No" else "red") for lvl in levels}

    sns.set(font_scale=2)
    sns_plot = sns.lmplot(x=f"logFC_{FC_1_name}",
                          y=f"logFC_{FC_2_name}",
                          data=df,
                          fit_reg=False,
                          hue=hue,
                          hue_order=hue_order,
                          palette=palette,
                          height=15,
                          aspect=1)
    # Title the grid's own axes rather than whatever pyplot considers current.
    sns_plot.axes.flat[0].set_title(f"FCs between {FC_1_name} and {FC_2_name}")
In [6]:
def compare(df_1_path, df_2_path, df_1_name, df_2_name):

    """Load two tables, draw their MA plots, merge them and plot the FCs.

    Each table is prepared with ``prepare_df`` and immediately shown as an
    MA plot. The two tables are then merged with ``merge_df`` and the
    fold-change correlation is drawn three times, coloured in turn by the
    merged ``Significant`` column and by each table's own significance
    column.

    Parameters
    ----------
    df_1_path, df_2_path : path
        Locations of the two input tables.
    df_1_name, df_2_name : str
        Labels used as column suffixes for the respective tables.

    Returns
    -------
    pandas.DataFrame
        The merged table.
    """

    # Prepare and plot one table at a time, first table first.
    prepared = []
    for path, name in ((df_1_path, df_1_name), (df_2_path, df_2_name)):
        table = prepare_df(path, name)
        plot_MA(table, name)
        prepared.append(table)

    df_merged = merge_df(*prepared)

    # Same scatter three times, only the colouring column changes.
    hue_columns = ("Significant",
                   "Significant_" + df_1_name,
                   "Significant_" + df_2_name)
    for hue_column in hue_columns:
        plot_FC_correlation(df_merged, df_1_name, df_2_name, hue=hue_column)

    return df_merged

Input files

In [7]:
# Each input is a 22G.tsv result table plus the short label that is appended
# to its column names when the table is loaded.

# Table from the WAGO_3_IP results directory.
ip_path = "../WAGO_3_IP/results/DE__N2_Input_Rpph__N2_xf119_Rpph/plot_small_RNAs/22G.tsv"
ip_name = "IP"

# Tables from the small_RNA_seq_15_C results directory.
Dpf_3_null_path = "../small_RNA_seq_15_C/results/DE__WT_dpf3__Dpf_3_null/plot_small_RNAs/22G.tsv"
Dpf_3_null_name = "Dpf_3_null"

# The original path had a doubled "/" before plot_small_RNAs; normalised here.
Dpf_3_S784A_path = "../small_RNA_seq_15_C/results/DE__WT_dpf3__Dpf_3_S784A/plot_small_RNAs/22G.tsv"
Dpf_3_S784A_name = "Dpf_3_S784A"

mut_2_path = "../small_RNA_seq_15_C/results/DE__WT_other__mut_2/plot_small_RNAs/22G.tsv"
mut_2_name = "mut_2"

mut_7_path = "../small_RNA_seq_15_C/results/DE__WT_other__mut_7/plot_small_RNAs/22G.tsv"
mut_7_name = "mut_7"

# Same mutants, taken from the "WT_dpf3_first" / "WT_dpf3_second" result folders.
Dpf_3_null_first_path = "../small_RNA_seq_15_C/results/DE__WT_dpf3_first__Dpf_3_null/plot_small_RNAs/22G.tsv"
Dpf_3_null_first_name = "Dpf_3_null_first"

Dpf_3_S784A_second_path = "../small_RNA_seq_15_C/results/DE__WT_dpf3_second__Dpf_3_S784A/plot_small_RNAs/22G.tsv"
Dpf_3_S784A_second_name = "Dpf_3_S784A_second"

# Tables from the small_RNA_seq_25_C results directory, labelled "_old".
Dpf_3_null_old_path = "../small_RNA_seq_25_C/results/DE__WT__dpf_3_delta/plot_small_RNAs/22G.tsv"
Dpf_3_null_old_name = "Dpf_3_null_old"

dpf_3S784A_old_path = "../small_RNA_seq_25_C/results/DE__WT__dpf_3S784A/plot_small_RNAs/22G.tsv"
dpf_3S784A_old_name = "Dpf_3_S784A_old"

Comparisons

1. IP vs Dpf_3_null

In [8]:
# Pair each input table with its label for this comparison.
df_1_path, df_1_name = ip_path, ip_name
df_2_path, df_2_name = Dpf_3_null_path, Dpf_3_null_name

df_all = compare(df_1_path=df_1_path, df_2_path=df_2_path,
                 df_1_name=df_1_name, df_2_name=df_2_name)

# Clear the flag, then mark rows that pass FDR and logCPM cut-offs in both
# tables and have a positive logFC in the first and a negative logFC in the
# second table.
df_all["Significant"] = "No"
df_all.loc[
    df_all[f"FDR_{df_1_name}"].lt(0.05)
    & df_all[f"FDR_{df_2_name}"].lt(0.05)
    & df_all[f"logFC_{df_1_name}"].gt(0)
    & df_all[f"logFC_{df_2_name}"].lt(0)
    & df_all[f"logCPM_{df_1_name}"].gt(2)
    & df_all[f"logCPM_{df_2_name}"].gt(2),
    "Significant"
] = "\n".join([
    f"FDR_{df_1_name}<0.05",
    f"FDR_{df_2_name}<0.05",
    f"logFC_{df_1_name}>0",
    f"logFC_{df_2_name}<0",
    f"logCPM_{df_1_name}>2",
    f"logCPM_{df_2_name}>2",
])

# Number of flagged rows.
print(int(df_all["Significant"].ne("No").sum()))

plot_FC_correlation(df=df_all, FC_1_name=df_1_name, FC_2_name=df_2_name, hue="Significant")
1816

2. IP vs Dpf_3_S784A

In [9]:
# Pair each input table with its label for this comparison.
df_1_path, df_1_name = ip_path, ip_name
df_2_path, df_2_name = Dpf_3_S784A_path, Dpf_3_S784A_name

df_all = compare(df_1_path=df_1_path, df_2_path=df_2_path,
                 df_1_name=df_1_name, df_2_name=df_2_name)

# Clear the flag, then mark rows that pass FDR and logCPM cut-offs in both
# tables and have a positive logFC in the first and a negative logFC in the
# second table.
df_all["Significant"] = "No"
df_all.loc[
    df_all[f"FDR_{df_1_name}"].lt(0.05)
    & df_all[f"FDR_{df_2_name}"].lt(0.05)
    & df_all[f"logFC_{df_1_name}"].gt(0)
    & df_all[f"logFC_{df_2_name}"].lt(0)
    & df_all[f"logCPM_{df_1_name}"].gt(2)
    & df_all[f"logCPM_{df_2_name}"].gt(2),
    "Significant"
] = "\n".join([
    f"FDR_{df_1_name}<0.05",
    f"FDR_{df_2_name}<0.05",
    f"logFC_{df_1_name}>0",
    f"logFC_{df_2_name}<0",
    f"logCPM_{df_1_name}>2",
    f"logCPM_{df_2_name}>2",
])

# Number of flagged rows.
print(int(df_all["Significant"].ne("No").sum()))

plot_FC_correlation(df=df_all, FC_1_name=df_1_name, FC_2_name=df_2_name, hue="Significant")
2169

3. IP vs mut_2

In [10]:
# Pair each input table with its label for this comparison.
df_1_path, df_1_name = ip_path, ip_name
df_2_path, df_2_name = mut_2_path, mut_2_name

df_all = compare(df_1_path=df_1_path, df_2_path=df_2_path,
                 df_1_name=df_1_name, df_2_name=df_2_name)

# Clear the flag, then mark rows that pass FDR and logCPM cut-offs in both
# tables and have a positive logFC in the first and a negative logFC in the
# second table.
df_all["Significant"] = "No"
df_all.loc[
    df_all[f"FDR_{df_1_name}"].lt(0.05)
    & df_all[f"FDR_{df_2_name}"].lt(0.05)
    & df_all[f"logFC_{df_1_name}"].gt(0)
    & df_all[f"logFC_{df_2_name}"].lt(0)
    & df_all[f"logCPM_{df_1_name}"].gt(2)
    & df_all[f"logCPM_{df_2_name}"].gt(2),
    "Significant"
] = "\n".join([
    f"FDR_{df_1_name}<0.05",
    f"FDR_{df_2_name}<0.05",
    f"logFC_{df_1_name}>0",
    f"logFC_{df_2_name}<0",
    f"logCPM_{df_1_name}>2",
    f"logCPM_{df_2_name}>2",
])

# Number of flagged rows.
print(int(df_all["Significant"].ne("No").sum()))

plot_FC_correlation(df=df_all, FC_1_name=df_1_name, FC_2_name=df_2_name, hue="Significant")
2989

4. IP vs mut_7

In [11]:
# Pair each input table with its label for this comparison.
df_1_path, df_1_name = ip_path, ip_name
df_2_path, df_2_name = mut_7_path, mut_7_name

df_all = compare(df_1_path=df_1_path, df_2_path=df_2_path,
                 df_1_name=df_1_name, df_2_name=df_2_name)

# Clear the flag, then mark rows that pass FDR and logCPM cut-offs in both
# tables and have a positive logFC in the first and a negative logFC in the
# second table.
df_all["Significant"] = "No"
df_all.loc[
    df_all[f"FDR_{df_1_name}"].lt(0.05)
    & df_all[f"FDR_{df_2_name}"].lt(0.05)
    & df_all[f"logFC_{df_1_name}"].gt(0)
    & df_all[f"logFC_{df_2_name}"].lt(0)
    & df_all[f"logCPM_{df_1_name}"].gt(2)
    & df_all[f"logCPM_{df_2_name}"].gt(2),
    "Significant"
] = "\n".join([
    f"FDR_{df_1_name}<0.05",
    f"FDR_{df_2_name}<0.05",
    f"logFC_{df_1_name}>0",
    f"logFC_{df_2_name}<0",
    f"logCPM_{df_1_name}>2",
    f"logCPM_{df_2_name}>2",
])

# Number of flagged rows.
print(int(df_all["Significant"].ne("No").sum()))

plot_FC_correlation(df=df_all, FC_1_name=df_1_name, FC_2_name=df_2_name, hue="Significant")
2439

5.a. Dpf_3_null vs Dpf_3_S784A with the same WTs (this is wrong, but I keep it to show the differences)

In [12]:
# Pair each input table with its label for this comparison.
df_1_path, df_1_name = Dpf_3_null_path, Dpf_3_null_name
df_2_path, df_2_name = Dpf_3_S784A_path, Dpf_3_S784A_name

# Draw the MA and FC plots; the merged table is flagged in the next cell.
df_all = compare(df_1_path=df_1_path, df_2_path=df_2_path,
                 df_1_name=df_1_name, df_2_name=df_2_name)
In [13]:
# Clear the flag, then mark rows that pass FDR and logCPM cut-offs in both
# tables and have a negative logFC in both.
df_all["Significant"] = "No"
df_all.loc[
    df_all[f"FDR_{df_1_name}"].lt(0.05)
    & df_all[f"FDR_{df_2_name}"].lt(0.05)
    & df_all[f"logFC_{df_1_name}"].lt(0)
    & df_all[f"logFC_{df_2_name}"].lt(0)
    & df_all[f"logCPM_{df_1_name}"].gt(2)
    & df_all[f"logCPM_{df_2_name}"].gt(2),
    "Significant"
] = "\n".join([
    f"FDR_{df_1_name}<0.05",
    f"FDR_{df_2_name}<0.05",
    f"logFC_{df_1_name}<0",
    f"logFC_{df_2_name}<0",
    f"logCPM_{df_1_name}>2",
    f"logCPM_{df_2_name}>2",
])

# Number of flagged rows.
print(int(df_all["Significant"].ne("No").sum()))

plot_FC_correlation(df=df_all, FC_1_name=df_1_name, FC_2_name=df_2_name, hue="Significant")
3073

5.b. Dpf_3_null vs Dpf_3_S784A with different WTs

In [14]:
# Pair each input table with its label for this comparison.
df_1_path, df_1_name = Dpf_3_null_first_path, Dpf_3_null_first_name
df_2_path, df_2_name = Dpf_3_S784A_second_path, Dpf_3_S784A_second_name

# Draw the MA and FC plots; the merged table is flagged in the next cell.
df_all = compare(df_1_path=df_1_path, df_2_path=df_2_path,
                 df_1_name=df_1_name, df_2_name=df_2_name)
In [15]:
# Clear the flag, then mark rows that pass FDR and logCPM cut-offs in both
# tables and have a negative logFC in both.
df_all["Significant"] = "No"
df_all.loc[
    df_all[f"FDR_{df_1_name}"].lt(0.05)
    & df_all[f"FDR_{df_2_name}"].lt(0.05)
    & df_all[f"logFC_{df_1_name}"].lt(0)
    & df_all[f"logFC_{df_2_name}"].lt(0)
    & df_all[f"logCPM_{df_1_name}"].gt(2)
    & df_all[f"logCPM_{df_2_name}"].gt(2),
    "Significant"
] = "\n".join([
    f"FDR_{df_1_name}<0.05",
    f"FDR_{df_2_name}<0.05",
    f"logFC_{df_1_name}<0",
    f"logFC_{df_2_name}<0",
    f"logCPM_{df_1_name}>2",
    f"logCPM_{df_2_name}>2",
])

# Number of flagged rows.
print(int(df_all["Significant"].ne("No").sum()))

plot_FC_correlation(df=df_all, FC_1_name=df_1_name, FC_2_name=df_2_name, hue="Significant")
1513

6. Dpf_3_null vs mut_2

In [16]:
# Pair each input table with its label for this comparison.
df_1_path, df_1_name = Dpf_3_null_path, Dpf_3_null_name
df_2_path, df_2_name = mut_2_path, mut_2_name

# Draw the MA and FC plots; the merged table is flagged in the next cell.
df_all = compare(df_1_path=df_1_path, df_2_path=df_2_path,
                 df_1_name=df_1_name, df_2_name=df_2_name)
In [17]:
# Clear the flag, then mark rows that pass FDR and logCPM cut-offs in both
# tables and have a negative logFC in both.
df_all["Significant"] = "No"
df_all.loc[
    df_all[f"FDR_{df_1_name}"].lt(0.05)
    & df_all[f"FDR_{df_2_name}"].lt(0.05)
    & df_all[f"logFC_{df_1_name}"].lt(0)
    & df_all[f"logFC_{df_2_name}"].lt(0)
    & df_all[f"logCPM_{df_1_name}"].gt(2)
    & df_all[f"logCPM_{df_2_name}"].gt(2),
    "Significant"
] = "\n".join([
    f"FDR_{df_1_name}<0.05",
    f"FDR_{df_2_name}<0.05",
    f"logFC_{df_1_name}<0",
    f"logFC_{df_2_name}<0",
    f"logCPM_{df_1_name}>2",
    f"logCPM_{df_2_name}>2",
])

# Number of flagged rows.
print(int(df_all["Significant"].ne("No").sum()))

plot_FC_correlation(df=df_all, FC_1_name=df_1_name, FC_2_name=df_2_name, hue="Significant")
2793

7. Dpf_3_null vs mut_7

In [18]:
# Pair each input table with its label for this comparison.
df_1_path, df_1_name = Dpf_3_null_path, Dpf_3_null_name
df_2_path, df_2_name = mut_7_path, mut_7_name

# Draw the MA and FC plots; the merged table is flagged in the next cell.
df_all = compare(df_1_path=df_1_path, df_2_path=df_2_path,
                 df_1_name=df_1_name, df_2_name=df_2_name)
In [19]:
# Clear the flag, then mark rows that pass FDR and logCPM cut-offs in both
# tables and have a negative logFC in both.
df_all["Significant"] = "No"
df_all.loc[
    df_all[f"FDR_{df_1_name}"].lt(0.05)
    & df_all[f"FDR_{df_2_name}"].lt(0.05)
    & df_all[f"logFC_{df_1_name}"].lt(0)
    & df_all[f"logFC_{df_2_name}"].lt(0)
    & df_all[f"logCPM_{df_1_name}"].gt(2)
    & df_all[f"logCPM_{df_2_name}"].gt(2),
    "Significant"
] = "\n".join([
    f"FDR_{df_1_name}<0.05",
    f"FDR_{df_2_name}<0.05",
    f"logFC_{df_1_name}<0",
    f"logFC_{df_2_name}<0",
    f"logCPM_{df_1_name}>2",
    f"logCPM_{df_2_name}>2",
])

# Number of flagged rows.
print(int(df_all["Significant"].ne("No").sum()))

plot_FC_correlation(df=df_all, FC_1_name=df_1_name, FC_2_name=df_2_name, hue="Significant")
2372

8. Dpf_3_S784A vs mut_2

In [20]:
# Pair each input table with its label for this comparison.
df_1_path, df_1_name = Dpf_3_S784A_path, Dpf_3_S784A_name
df_2_path, df_2_name = mut_2_path, mut_2_name

# Draw the MA and FC plots; the merged table is flagged in the next cell.
df_all = compare(df_1_path=df_1_path, df_2_path=df_2_path,
                 df_1_name=df_1_name, df_2_name=df_2_name)
In [21]:
# Clear the flag, then mark rows that pass FDR and logCPM cut-offs in both
# tables and have a negative logFC in both.
df_all["Significant"] = "No"
df_all.loc[
    df_all[f"FDR_{df_1_name}"].lt(0.05)
    & df_all[f"FDR_{df_2_name}"].lt(0.05)
    & df_all[f"logFC_{df_1_name}"].lt(0)
    & df_all[f"logFC_{df_2_name}"].lt(0)
    & df_all[f"logCPM_{df_1_name}"].gt(2)
    & df_all[f"logCPM_{df_2_name}"].gt(2),
    "Significant"
] = "\n".join([
    f"FDR_{df_1_name}<0.05",
    f"FDR_{df_2_name}<0.05",
    f"logFC_{df_1_name}<0",
    f"logFC_{df_2_name}<0",
    f"logCPM_{df_1_name}>2",
    f"logCPM_{df_2_name}>2",
])

# Number of flagged rows.
print(int(df_all["Significant"].ne("No").sum()))

plot_FC_correlation(df=df_all, FC_1_name=df_1_name, FC_2_name=df_2_name, hue="Significant")
3337

9. Dpf_3_S784A vs mut_7

In [22]:
# Pair each input table with its label for this comparison.
df_1_path, df_1_name = Dpf_3_S784A_path, Dpf_3_S784A_name
df_2_path, df_2_name = mut_7_path, mut_7_name

# Draw the MA and FC plots; the merged table is flagged in the next cell.
df_all = compare(df_1_path=df_1_path, df_2_path=df_2_path,
                 df_1_name=df_1_name, df_2_name=df_2_name)
In [23]:
# Clear the flag, then mark rows that pass FDR and logCPM cut-offs in both
# tables and have a negative logFC in both.
df_all["Significant"] = "No"
df_all.loc[
    df_all[f"FDR_{df_1_name}"].lt(0.05)
    & df_all[f"FDR_{df_2_name}"].lt(0.05)
    & df_all[f"logFC_{df_1_name}"].lt(0)
    & df_all[f"logFC_{df_2_name}"].lt(0)
    & df_all[f"logCPM_{df_1_name}"].gt(2)
    & df_all[f"logCPM_{df_2_name}"].gt(2),
    "Significant"
] = "\n".join([
    f"FDR_{df_1_name}<0.05",
    f"FDR_{df_2_name}<0.05",
    f"logFC_{df_1_name}<0",
    f"logFC_{df_2_name}<0",
    f"logCPM_{df_1_name}>2",
    f"logCPM_{df_2_name}>2",
])

# Number of flagged rows.
print(int(df_all["Significant"].ne("No").sum()))

plot_FC_correlation(df=df_all, FC_1_name=df_1_name, FC_2_name=df_2_name, hue="Significant")
2823

10. Dpf_3_null vs Dpf_3_null_old

In [24]:
# Pair each input table with its label for this comparison.
df_1_path, df_1_name = Dpf_3_null_path, Dpf_3_null_name
df_2_path, df_2_name = Dpf_3_null_old_path, Dpf_3_null_old_name

# Draw the MA and FC plots; the merged table is flagged in the next cell.
df_all = compare(df_1_path=df_1_path, df_2_path=df_2_path,
                 df_1_name=df_1_name, df_2_name=df_2_name)
In [25]:
# Clear the flag, then mark rows that pass FDR and logCPM cut-offs in both
# tables and have a negative logFC in both.
df_all["Significant"] = "No"
df_all.loc[
    df_all[f"FDR_{df_1_name}"].lt(0.05)
    & df_all[f"FDR_{df_2_name}"].lt(0.05)
    & df_all[f"logFC_{df_1_name}"].lt(0)
    & df_all[f"logFC_{df_2_name}"].lt(0)
    & df_all[f"logCPM_{df_1_name}"].gt(2)
    & df_all[f"logCPM_{df_2_name}"].gt(2),
    "Significant"
] = "\n".join([
    f"FDR_{df_1_name}<0.05",
    f"FDR_{df_2_name}<0.05",
    f"logFC_{df_1_name}<0",
    f"logFC_{df_2_name}<0",
    f"logCPM_{df_1_name}>2",
    f"logCPM_{df_2_name}>2",
])

# Number of flagged rows.
print(int(df_all["Significant"].ne("No").sum()))

plot_FC_correlation(df=df_all, FC_1_name=df_1_name, FC_2_name=df_2_name, hue="Significant")
1090

11. Dpf_3_S784A vs Dpf_3_S784A_old

In [26]:
# Pair each input table with its label for this comparison.
df_1_path, df_1_name = Dpf_3_S784A_path, Dpf_3_S784A_name
df_2_path, df_2_name = dpf_3S784A_old_path, dpf_3S784A_old_name

# Draw the MA and FC plots; the merged table is flagged in the next cell.
df_all = compare(df_1_path=df_1_path, df_2_path=df_2_path,
                 df_1_name=df_1_name, df_2_name=df_2_name)
In [27]:
# Clear the flag, then mark rows that pass FDR and logCPM cut-offs in both
# tables and have a negative logFC in both.
df_all["Significant"] = "No"
df_all.loc[
    df_all[f"FDR_{df_1_name}"].lt(0.05)
    & df_all[f"FDR_{df_2_name}"].lt(0.05)
    & df_all[f"logFC_{df_1_name}"].lt(0)
    & df_all[f"logFC_{df_2_name}"].lt(0)
    & df_all[f"logCPM_{df_1_name}"].gt(2)
    & df_all[f"logCPM_{df_2_name}"].gt(2),
    "Significant"
] = "\n".join([
    f"FDR_{df_1_name}<0.05",
    f"FDR_{df_2_name}<0.05",
    f"logFC_{df_1_name}<0",
    f"logFC_{df_2_name}<0",
    f"logCPM_{df_1_name}>2",
    f"logCPM_{df_2_name}>2",
])

# Number of flagged rows.
print(int(df_all["Significant"].ne("No").sum()))

plot_FC_correlation(df=df_all, FC_1_name=df_1_name, FC_2_name=df_2_name, hue="Significant")
1233
In [ ]:
 
In [28]:
# Dpf_3_S784A_selected_22G_RNAs = Dpf_3_S784A[(Dpf_3_S784A["Significant_Dpf_3_S784A"] == "FDR<0.05\nlogCPM>2") & (Dpf_3_S784A["logFC_Dpf_3_S784A"] < 0)]["id"].tolist()
In [29]:
# IP_selected_22G_RNAs = ip[(ip["Significant_IP"] == "FDR<0.05\nlogCPM>2") & (ip["logFC_IP"] > 0)]["id"].tolist()
In [30]:
# len(IP_selected_22G_RNAs), len(Dpf_3_S784A_selected_22G_RNAs)
In [31]:
# overlap = [value for value in Dpf_3_S784A_selected_22G_RNAs if value in IP_selected_22G_RNAs]
In [32]:
# len(overlap)
In [ ]: